{
# Process the empirical application data.
# In every case we start from the raw datasets and process them into three
# data.frames: x, y, and z. Here x and y are the two tables to be merged, and
# z is the human-coded ground-truth merge of x and y.
# Example 2: builds x_red, y_red and the human-coded merge z_red_human, keyed
# on `Name` (x side) and `Organization` (y side). The raw-data branch is
# switched off by default; normally the cached CSVs under ./DataInputs are
# simply read back in.
# Reads globals: exampleNum_i, reduceData; the rebuild branch additionally
# reads FuzzyDistanceMeasureMaster and qgramMaster.
if (exampleNum_i == 2) {
  # process example 2 data
  results_by_x <- by.x <- "Name"
  results_by_y <- by.y <- "Organization"

  # flip to TRUE to rebuild the cached CSVs from the raw inputs
  resave_ <- FALSE
  if (resave_) {
    # presumably supplies the raw `x`, `y` and `z` objects used below -- confirm
    load("./DataInputs/meetings_application.Rdata")

    # get ticker data
    {
      library(BatchGetSymbols)
      x$Return <- as.numeric(NA)
      counter_ <- 0
      for (symbol_i in x$Symbol) {
        print(counter_ / nrow(x)) # progress as a fraction of rows handled
        counter_ <- counter_ + 1
        l.out <- try(
          BatchGetSymbols(
            tickers = symbol_i, freq.data = "yearly",
            first.date = as.Date("2012-01-01"),
            last.date = as.Date("2016-01-01"),
            thresh.bad.data = 0.01,
            cache.folder = file.path(tempdir(), "BGS_Cache")
          ),
          silent = TRUE
        )
        # inherits() stays a scalar test even if the result carries
        # several classes; failed downloads leave Return as NA
        if (!inherits(l.out, "try-error")) {
          x[counter_, ]$Return <- mean(l.out$df.tickers$ret.adjusted.prices,
                                       na.rm = TRUE)
        }
      }
      x <- as.data.frame(x)
      save.image("./DataInputs/x_withFinancials.Rdata")
      #load("./Downloads/x_withFinancials.Rdata") # check save.image with load()
    }

    # resolve x and y
    x_red <- x
    y_red <- y

    # resolve human merge: lower-case the keys, then attach x (via Symbol)
    # and y (via Organization) to the human-coded table
    x_red[[by.x]] <- tolower(x_red[[by.x]])
    y_red[[by.y]] <- tolower(y_red[[by.y]])
    z_red_human <- z
    z_red_human <- z_red_human[which(!is.na(z_red_human$Symbol)), ]
    z_red_human[["Organization"]] <- tolower(z_red_human[["Organization"]])
    z_red_human <- merge(x_red, z_red_human, by.x = "Symbol", by.y = "Symbol",
                         all.x = FALSE, all.y = FALSE)
    z_red_human <- merge(y_red, z_red_human, by = "Organization",
                         all.x = FALSE, all.y = FALSE)

    # diagnostics kept for interactive inspection; only repeat_vec is stored
    repeat_vec <- table(z_red_human$Name)[table(z_red_human$Name) > 1]
    z_red_human[z_red_human$Name %in% names(repeat_vec), ]
    mean(!is.na(z_red_human$Return))

    # keep one row per (x key, y key) pair
    z_red_human <- z_red_human[!duplicated(paste(z_red_human[[by.x]],
                                                 z_red_human[[by.y]],
                                                 sep = "_")), ]
    z_red_human$stringdist <- stringdist::stringdist(
      z_red_human[[results_by_x]], z_red_human[[results_by_y]],
      method = FuzzyDistanceMeasureMaster,
      q = qgramMaster
    )
    write.csv(x_red, file = "./DataInputs/x_red_Example2_final.csv")
    write.csv(y_red, file = "./DataInputs/y_red_Example2_final.csv")
    write.csv(z_red_human, file = "./DataInputs/z_red_Example2_final.csv")
  }
  x_red <- read.csv(file = "./DataInputs/x_red_Example2_final.csv")
  y_red <- read.csv(file = "./DataInputs/y_red_Example2_final.csv")
  z_red_human <- read.csv(file = "./DataInputs/z_red_Example2_final.csv")

  if (reduceData == TRUE) {
    print("WARNING: REDUCING DATA")
    # head() caps at the available rows; indexing with 1:100 would pad a
    # shorter table with all-NA rows
    x_red <- head(x_red, 100)
    y_red <- head(y_red, 101)
  }
}
# Example 4: builds x_red (Fortune 1000 spreadsheet data, keyed on
# `CompanyName`), y_red (lobbying records aggregated per `Ultorg`) and the
# human-coded merge z_red_human. The raw-data branch is switched off by
# default; normally the cached CSVs under ./DataInputs are read back in.
# Reads globals: exampleNum_i and the helper f2n (defined elsewhere;
# presumably a factor/character-to-numeric converter -- confirm).
if (exampleNum_i == 4) {
  # example 4 data

  # see https://www.depts.ttu.edu/rawlsbusiness/about/finance/research-seminar/documents/Lobbying_CEF_01Apr2015.pdf
  # flip to TRUE to rebuild the cached CSVs from the raw inputs
  resave_ <- FALSE
  if (resave_) {
    # get y_red
    {
      yearBounds <- c(2016, 2016)
      library(RSQLite)
      con <- DBI::dbConnect(RSQLite::SQLite(),
                            "./DataInputs/opensecretslobbying.sqlite")

      ## list all tables
      tables <- DBI::dbListTables(con)

      ## exclude sqlite_sequence (contains table information)
      tables <- tables[tables != "sqlite_sequence"]

      lDataFrames <- vector("list", length = length(tables))

      ## create a data.frame for each table
      for (i in seq_along(tables)) {
        lDataFrames[[i]] <- DBI::dbGetQuery(
          conn = con,
          statement = paste0("SELECT * FROM '", tables[[i]], "'")
        )
      }
      # everything needed is in memory now; release the database handle
      DBI::dbDisconnect(con)

      # only the first table is used downstream
      y_red <- lDataFrames[[1]]
      y_red <- y_red[y_red$Year >= yearBounds[1] & y_red$Year <= yearBounds[2], ]

      # one summary row per Ultorg; c() mixes numbers with the Ultorg string,
      # so every field comes back as character (hence f2n() below)
      y_ultorg <- tapply(seq_len(nrow(y_red)), y_red$Ultorg, function(ze) {
        tmp_ <- y_red[ze, ]
        c("MeanContribAmount" = mean(y_red[ze, "Amount"], na.rm = TRUE),
          "TotalContribAmount" = sum(y_red[ze, "Amount"], na.rm = TRUE),
          "NYearsContribute" = length(unique(y_red[ze, "Year"])),
          "MeanContribAmountPerYearContributed" = mean(tapply(tmp_$Amount, tmp_$Year, sum)),
          "MeanContribYear" = mean(y_red[ze, "Year"], na.rm = TRUE),
          "Ultorg" = unique(y_red[ze, "Ultorg"]))
      })
      y_ultorg <- as.data.frame(do.call(rbind, y_ultorg))
      row.names(y_ultorg) <- NULL
      y_ultorg <- y_ultorg[f2n(y_ultorg$TotalContribAmount) > 0, ]
      y_red <- y_ultorg
      by.y <- "Ultorg"
      rm(y_ultorg)
      rm(lDataFrames)
      y_red <- y_red[!duplicated(y_red$Ultorg), ]
    }

    # get x_red
    {
      years_ <- seq(yearBounds[1], yearBounds[2], by = 1)
      x_red <- c()
      for (year_ in years_) {
        my_xls_loc <- sprintf("./DataInputs/Fortune 1000 Data/Fortune 1000 US List %s_Someka", year_)
        my_xls <- list.files(my_xls_loc, pattern = "\\.xls")
        tmp_ <- readxl::read_xlsx(sprintf("%s/%s", my_xls_loc, my_xls))

        # drop the first five rows, then promote the next row to the header
        tmp_ <- as.data.frame(tmp_[-c(1:5), ])
        colnames(tmp_) <- tmp_[1, ]
        tmp_ <- tmp_[-1, ]

        tmp_$`Company Name` <- gsub(tmp_$`Company Name`, pattern = "’", replacement = "'")
        tmp_$`Company Name` <- tolower(tmp_$`Company Name`)
        tmp_$Year <- year_

        # strip line breaks, "$" and parentheses from the column names
        colnames(tmp_) <- gsub(colnames(tmp_), pattern = "\\r", replacement = "")
        colnames(tmp_) <- gsub(colnames(tmp_), pattern = "\\$", replacement = "")
        colnames(tmp_) <- gsub(colnames(tmp_), pattern = "\\(", replacement = "")
        colnames(tmp_) <- gsub(colnames(tmp_), pattern = "\\)", replacement = "")
        colnames(tmp_) <- gsub(colnames(tmp_), pattern = "\\n", replacement = "")
        colnames(tmp_)[grepl(colnames(tmp_), pattern = "Market Value")] <- "MarketValue"

        # "-" marks a missing cell; every column after the first two goes
        # through f2n
        tmp_[tmp_ == "-"] <- NA
        tmp_[, -c(1:2)] <- apply(tmp_[, -c(1:2)], 2, f2n)

        # drop unnamed columns, sort the rest, discard the rank-change column
        tmp_ <- tmp_[, which(colnames(tmp_) != "NA")]
        tmp_ <- tmp_[, !is.na(colnames(tmp_))]
        tmp_ <- tmp_[, order(colnames(tmp_))]
        tmp_ <- tmp_[, !colnames(tmp_) %in% c("Change in Rank")]

        # later years keep only the columns already accumulated
        if (length(x_red) > 0) {
          tmp_ <- tmp_[, intersect(colnames(x_red), colnames(tmp_))]
        }
        print(setdiff(colnames(tmp_), colnames(x_red)))
        x_red <- rbind(x_red, tmp_)
        #write.csv(file = "./DataInputs/Fortune1000Data.csv",x_red) # checkpoints
        #write.csv(file = "./DataInputs/OpenSecretsData.csv",y_red) # checkpoints
      }
      rm(tmp_)

      # one summary row per company; as above, c() coerces all fields to
      # character because CompanyName is a string
      x_red_ <- tapply(seq_len(nrow(x_red)), x_red$`Company Name`, function(ze) {
        ze_tmp <- x_red[ze, ]
        c("MeanAssets" = mean(ze_tmp$Assetsmillions, na.rm = TRUE),
          "MeanMarketValue" = mean(ze_tmp$MarketValue, na.rm = TRUE),
          "MeanProfit" = mean(ze_tmp$Profitsmillions, na.rm = TRUE),
          "YearsOnList" = length(unique(ze_tmp$Year)),
          "CompanyName" = unique(ze_tmp$`Company Name`))
      })
      x_red_ <- do.call(rbind, x_red_)
      row.names(x_red_) <- NULL
      x_red <- x_red_
      rm(x_red_)
      by.x <- "CompanyName"
      x_red <- as.data.frame(x_red)

      # keep rows whose market-value rank (descending) is at most 1000
      x_red <- x_red[rank(-f2n(x_red$MeanMarketValue)) <= 1000, ]
    }
    write.csv(file = "./DataInputs/x_red_MarketCap.csv", x_red)
    write.csv(file = "./DataInputs/y_red_Lobby.csv", y_red)
  }

  x_red <- read.csv(file = "./DataInputs/x_red_MarketCap.csv")
  results_by_x <- by.x <- "CompanyName"
  y_red <- read.csv(file = "./DataInputs/y_red_Lobby.csv")
  results_by_y <- by.y <- "Ultorg"

  # human-coded pairs: drop rows without an Ultorg and the first column
  z_red_human <- read.csv(file = "./DataInputs/Fortune1000HumanCoded.csv")
  z_red_human <- z_red_human[!is.na(z_red_human$Ultorg), -1]

  # lower-case the key columns before merging
  y_red[[by.y]] <- tolower(y_red[[by.y]])
  x_red[[by.x]] <- tolower(x_red[[by.x]])
  z_red_human <- merge(x = z_red_human, y = y_red, by = by.y, all.x = TRUE, all.y = FALSE)
  z_red_human <- merge(x = z_red_human, y = x_red, by = by.x, all.x = TRUE, all.y = FALSE)
}
# Example 5: cross-language name pairs. Each retained row of the input sheet
# is itself a human-coded match, so x_red (Name1 side) and y_red (Name2 side)
# are both cut from z_red_human and share its UniqueID.
if (exampleNum_i == 5) {
  # process example 5 data
  maxDist <- 1

  # read in data and keep only the rows whose second language is Chinese
  z_red_human <- read.csv(file = "./DataInputs/ForeignNames_clean - Sheet1.csv")
  z_red_human$UniqueID <- 1:nrow(z_red_human)
  z_red_human <- z_red_human[z_red_human$Language2 == "chinese", ]

  # drop latin characters from non-latin names: lower-case first, then strip
  # letters, digits 1-9, "&", ":", ")" and "(" one pattern at a time
  # NOTE(review): "[1-9]" leaves the digit 0 in place -- confirm this is intended
  z_red_human$Name2 <- Reduce(
    function(txt, pat) gsub(pattern = pat, replacement = "", x = txt),
    c("[a-z]", "[1-9]", "\\&", ":", "\\)", "\\("),
    tolower(z_red_human$Name2)
  )
  z_red_human$Name2 <- trimws(z_red_human$Name2)

  # rows whose second name is empty after cleaning are discarded
  z_red_human <- z_red_human[z_red_human$Name2 != "", ]

  # create x, y, z
  x_red <- as.data.frame(z_red_human[, c("UniqueID", "Name1", "Language1")])
  results_by_x <- by.x <- "Name1"
  y_red <- as.data.frame(z_red_human[, c("UniqueID", "Name2", "Language2")])
  results_by_y <- by.y <- "Name2"
  z_red_human <- as.data.frame(z_red_human)
  x <- x_red
  y <- y_red
}
# Example 7: San Francisco PPP borrowers (x, keyed on `BorrowerName`) versus
# Y Combinator companies (y, keyed on `name`). The ground truth z_red_human is
# assembled from the hand-curated KeyLinker table below.
if (exampleNum_i == 7) {
  # process example 7 data

  # first, load in data and define names
  # here, the san francisco ppp data (first column dropped)
  x <- read.csv("./DataInputs/ppp_150kplus_sf.csv")[, -1]
  results_by_x <- by.x <- "BorrowerName"

  # here, the y combinator data (first column dropped)
  y <- read.csv("./DataInputs/ycombinator_sf.csv")[, -1]
  results_by_y <- by.y <- "name"

  # process names: lower-case, strip punctuation, collapse runs of whitespace
  by.x_clean <- paste0(by.x, "_clean")
  by.y_clean <- paste0(by.y, "_clean")
  x[[by.x_clean]] <- tolower(x[[by.x]])
  y[[by.y_clean]] <- tolower(y[[by.y]])

  x[[by.x_clean]] <- gsub(x[[by.x_clean]], pattern = "[[:punct:]]", replacement = "")
  y[[by.y_clean]] <- gsub(y[[by.y_clean]], pattern = "[[:punct:]]", replacement = "")
  x[[by.x_clean]] <- gsub(x[[by.x_clean]], pattern = "\\s+", replacement = " ")
  y[[by.y_clean]] <- gsub(y[[by.y_clean]], pattern = "\\s+", replacement = " ")

  # for every y name, the indices of x names sharing at least one word with it
  ynames <- strsplit(y[[by.y_clean]], split = " ")
  xnames <- strsplit(x[[by.x_clean]], split = " ")
  xy_pairs <- lapply(ynames, function(zer) {
    which(vapply(xnames, function(zerr) any(zer %in% zerr), logical(1)))
  })
  YWithSharedWords <- which(lengths(xy_pairs) >= 1)
  table(lengths(xy_pairs))

  # find matches (the two lookups below are kept for interactive inspection;
  # their values are not stored)
  ynames[ YWithSharedWords[14] ]
  y[[by.y]][ YWithSharedWords[301:317] ]

  # hand-coded links: column 1 is the cleaned y name, column 2 the raw x name
  KeyLinker <- matrix(c(
    "platejoy", "PLATEJOY, INC.",
    "rinse", "RINSE INC.",
    "zerocater", "ZEROCATER INC",
    "buoyant aero", "BUOYANT, INC.",
    "sayana", "SAYANA CORP.",
    "rescale", "RESCALE, INC",
    "fathom", "FATHOM, INC.",
    "ridecell", "RIDECELL INC",
    "bento", "BENTO TECHNOLOGIES, INC.",
    "canary technologies", "CANARY TECHNOLOGIES",
    "nebia", "NEBIA INC",
    "apto payments", "APTO PAYMENTS, INC.",
    "smarking", "SMARKING INC",
    "remix", "REMIX SOFTWARE",
    "spire health", "SPIRE INC.",
    "grubmarket", "GRUBMARKET, INC.",
    "extend", "EXTEND, INC.",
    "zinc", "ZINC TECHNOLOGIES INC.",
    "pathrise", "MAVENFORM INC (DBA PATHRISE)",
    "flockjay", "FLOCKJAY, INC.",
    "ravn", "RAVN INC",
    "carrot fertility", "CARROT FERTILITY INC.",
    "vinebox usual beverage co", "VINEBOX INC.",
    "modern health", "MODERN HEALTH INC.",
    "proxy", "PROXY INC.",
    "gigster", "GIGSTER INC",
    "lugg", "LUGG INC.",
    "circle medical", "CIRCLE MEDICAL CARE OF CALIFORNIA",
    "bitmovin", "BITMOVIN, INC.",
    "noredink", "NOREDINK CORP.",
    "lever", "LEVER, INC.",
    "iris automation", "IRIS AUTOMATION INC",
    "legalist", "LEGALIST, INC.",
    "skip", "SKIP TRANSPORT, INC",
    "teespring", "TEESPRING INC",
    "shone", "SHONE AUTOMATION, INC.",
    "quantstamp", "QUANTSTAMP, INC.",
    "buildzoom", "BUILDZOOM, INC",
    "cheetah", "CHEETAH TECHNOLOGIES, INC",
    "watsi", "WATSI INC",
    "wefunder", "WEFUNDER INC",
    "mixpanel", "MIXPANEL, INC.",
    "cover", "COVER FINANCIAL, INC.",
    "pathmind", "PATHMIND INC.",
    "alphaflow", "ALPHAFLOW INC.",
    "revl", "REVL INC.",
    "zenysis", "ZENYSIS TECHNOLOGIES INC.",
    "callisto", "CALLISTO",
    "castle", "CASTLE GLOBAL, INC.",
    "blaze", "BLAZE",
    "caremessage", "CAREMESSAGE",
    "apero health", "APERO HEALTH, INC.",
    "copia", "GO COPIA, PBC"
  ), byrow = TRUE, ncol = 2)

  # every key should be present in the data; an unmatched key would turn into
  # an all-NA row in the match() lookups below, so surface it explicitly
  if (!all(KeyLinker[, 1] %in% y[[by.y_clean]])) {
    warning("Some KeyLinker names were not found in y[[by.y_clean]]", call. = FALSE)
  }
  if (!all(KeyLinker[, 2] %in% x[[by.x]])) {
    warning("Some KeyLinker names were not found in x[[by.x]]", call. = FALSE)
  }

  y_matched <- y[match(KeyLinker[, 1], y[[by.y_clean]]), ]
  x_matched <- x[match(KeyLinker[, 2], x[[by.x]]), ]

  # by.x; by.y
  x_red <- x
  y_red <- y
  z_red_human <- cbind(x_matched, y_matched)

  # reduce the data and keep only the batches listed in keepBatches
  # NOTE(review): the list has S17 but no later summer batches -- confirm intended
  keepBatches <- c("S17", "W17", "W18", "W19", "W20", "W21", "W22", "W23", "W24")
  y <- y[y$batch %in% keepBatches, ]
  y_red <- y_red[y_red$batch %in% keepBatches, ]
  z_red_human <- z_red_human[z_red_human$batch %in% keepBatches, ]
}

  # shared post-processing
  # Lower-case the merge-key columns of all three tables so that keys compare
  # consistently. Direct [[<- assignment also works for column names that are
  # not syntactically valid R identifiers.
  y_red[[by.y]] <- tolower(y_red[[by.y]])
  x_red[[by.x]] <- tolower(x_red[[by.x]])
  z_red_human[[by.x]] <- tolower(z_red_human[[by.x]])
  z_red_human[[by.y]] <- tolower(z_red_human[[by.y]])

  # On the full data, every key in the human-coded table must also appear in
  # the corresponding source table. The check is skipped when reduceData is
  # set, since x_red / y_red may then have been truncated.
  if (!reduceData) {
    if (length(setdiff(unlist(z_red_human[by.x]), unlist(x_red[by.x]))) > 0 ||
        length(setdiff(unlist(z_red_human[by.y]), unlist(y_red[by.y]))) > 0) {
      stop("PROBLEMS IN Z_HUMAN")
    }
  }

  if (ReSaveDataAssets == TRUE) {
    # add key columns under fixed names (by_x / by_y) alongside the originals
    x_red$by_x <- x_red[[by.x]]
    y_red$by_y <- y_red[[by.y]]
    z_red_human$by_y <- z_red_human[[by.y]]
    z_red_human$by_x <- z_red_human[[by.x]]

    # label used in the output file names
    if (exampleNum_i == 2) { exampleName <- "Meetings" }
    if (exampleNum_i == 4) { exampleName <- "Fortune1000" }
    if (exampleNum_i == 5) { exampleName <- "CrossLanguage" }
    if (exampleNum_i == 7) { exampleName <- "YCombinator" }

    # write final data
    write.csv(x_red, file = sprintf("./DataInputs/xShare_%sExample.csv", exampleName))
    write.csv(y_red, file = sprintf("./DataInputs/yShare_%sExample.csv", exampleName))
    write.csv(z_red_human, file = sprintf("./DataInputs/zShare_%sExample.csv", exampleName))
  }
}
